[IA64] per vcpu vhpt
author awilliam@xenbuild.aw <awilliam@xenbuild.aw>
Mon, 9 Oct 2006 00:55:12 +0000 (18:55 -0600)
committer awilliam@xenbuild.aw <awilliam@xenbuild.aw>
Mon, 9 Oct 2006 00:55:12 +0000 (18:55 -0600)
Implement a per-vcpu VHPT option: allocate a VHPT for each vcpu.
Added a compile-time option, xen_ia64_pervcpu_vhpt=y, to enable it.
Its default is on.
Added a Xen boot-time option, pervcpu_vhpt=0, to disable it.

This patch focuses on vcpu migration between physical cpus
because vcpus are heavily migrated with the credit scheduler.
This patch tries to reduce vTLB flushes when a vcpu is migrated.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
xen/arch/ia64/Rules.mk
xen/arch/ia64/vmx/vmx_entry.S
xen/arch/ia64/xen/domain.c
xen/arch/ia64/xen/regionreg.c
xen/arch/ia64/xen/vhpt.c
xen/include/asm-ia64/domain.h
xen/include/asm-ia64/vhpt.h
xen/include/asm-ia64/xenkregs.h

index e11a791758d68f8a4bc86032d73efd9f17614445..9d943ffe313c30d31c9a5a689a3e09602a17d064 100644 (file)
@@ -6,6 +6,7 @@ HAS_VGA  := y
 VALIDATE_VT    ?= n
 no_warns ?= n
 xen_ia64_expose_p2m    ?= y
+xen_ia64_pervcpu_vhpt  ?= y
 
 ifneq ($(COMPILE_ARCH),$(TARGET_ARCH))
 CROSS_COMPILE ?= /usr/local/sp_env/v2.2.5/i686/bin/ia64-unknown-linux-
@@ -40,6 +41,9 @@ endif
 ifeq ($(xen_ia64_expose_p2m),y)
 CFLAGS += -DCONFIG_XEN_IA64_EXPOSE_P2M
 endif
+ifeq ($(xen_ia64_pervcpu_vhpt),y)
+CFLAGS += -DCONFIG_XEN_IA64_PERVCPU_VHPT
+endif
 ifeq ($(no_warns),y)
 CFLAGS += -Wa,--fatal-warnings -Werror -Wno-uninitialized
 endif
index 53b00d90193b30f0a0adb30e1872efa781f901bd..fa2a53670fe6396f946d6584c09ab5bf3873ecd1 100644 (file)
@@ -669,7 +669,7 @@ GLOBAL_ENTRY(vmx_switch_rr7)
 
    // re-pin mappings for guest_vhpt
 
-   mov r24=IA64_TR_PERVP_VHPT
+   mov r24=IA64_TR_VHPT
    movl r25=PAGE_KERNEL
    ;;
    or loc5 = r25,loc5          // construct PA | page properties
index d489299768e50fd3f5fb6001970ea9ba65286d53..4414e0ef45237bbc2dff914358cec0218171e968 100644 (file)
@@ -69,6 +69,16 @@ DEFINE_PER_CPU(int *, current_psr_ic_addr);
 
 #include <xen/sched-if.h>
 
+static void
+ia64_disable_vhpt_walker(void)
+{
+       // Disable the VHPT walker. ia64_new_rr7() might cause a VHPT
+       // fault without this because it flushes dtr[IA64_TR_VHPT].
+       // (VHPT_SIZE_LOG2 << 2) is written just to avoid a
+       // Reserved Register/Field fault.
+       ia64_set_pta(VHPT_SIZE_LOG2 << 2);
+}
+
 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
 {
        int cpu = smp_processor_id();
@@ -92,8 +102,10 @@ static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
                if (VMX_DOMAIN(vcpu)) {
                        // currently vTLB for vt-i domian is per vcpu.
                        // so any flushing isn't needed.
+               } else if (HAS_PERVCPU_VHPT(vcpu->domain)) {
+                       // nothing to do
                } else {
-                       vhpt_flush();
+                       local_vhpt_flush();
                }
                local_flush_tlb_all();
                perfc_incrc(flush_vtlb_for_context_switch);
@@ -111,9 +123,9 @@ void schedule_tail(struct vcpu *prev)
                              current->processor);
        } else {
                ia64_set_iva(&ia64_ivt);
-               ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
-                       VHPT_ENABLED);
+               ia64_disable_vhpt_walker();
                load_region_regs(current);
+               ia64_set_pta(vcpu_pta(current));
                vcpu_load_kernel_regs(current);
                __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
                  shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
@@ -127,7 +139,6 @@ void schedule_tail(struct vcpu *prev)
 void context_switch(struct vcpu *prev, struct vcpu *next)
 {
     uint64_t spsr;
-    uint64_t pta;
 
     local_irq_save(spsr);
 
@@ -164,9 +175,9 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
 
        nd = current->domain;
        if (!is_idle_domain(nd)) {
-               ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
-                            VHPT_ENABLED);
+               ia64_disable_vhpt_walker();
                load_region_regs(current);
+               ia64_set_pta(vcpu_pta(current));
                vcpu_load_kernel_regs(current);
                vcpu_set_next_timer(current);
                if (vcpu_timer_expired(current))
@@ -180,8 +191,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
                 * walker. Then all accesses happen within idle context will
                 * be handled by TR mapping and identity mapping.
                 */
-               pta = ia64_get_pta();
-               ia64_set_pta(pta & ~VHPT_ENABLED);
+               ia64_disable_vhpt_walker();
                __ia64_per_cpu_var(current_psr_i_addr) = NULL;
                __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
         }
@@ -270,6 +280,13 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
            if (!d->arch.is_vti) {
                int order;
                int i;
+               // vti domain has its own vhpt policy.
+               if (HAS_PERVCPU_VHPT(d)) {
+                       if (pervcpu_vhpt_alloc(v) < 0) {
+                               free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
+                               return NULL;
+                       }
+               }
 
                /* Create privregs page only if not VTi. */
                order = get_order_from_shift(XMAPPEDREGS_SHIFT);
@@ -312,6 +329,8 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
 
 void relinquish_vcpu_resources(struct vcpu *v)
 {
+    if (HAS_PERVCPU_VHPT(v->domain))
+        pervcpu_vhpt_free(v);
     if (v->arch.privregs != NULL) {
         free_xenheap_pages(v->arch.privregs,
                            get_order_from_shift(XMAPPEDREGS_SHIFT));
@@ -347,6 +366,11 @@ static void init_switch_stack(struct vcpu *v)
        memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
 }
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+static int opt_pervcpu_vhpt = 1;
+integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
+#endif
+
 int arch_domain_create(struct domain *d)
 {
        int i;
@@ -361,6 +385,11 @@ int arch_domain_create(struct domain *d)
        if (is_idle_domain(d))
            return 0;
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+       d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
+       DPRINTK("%s:%d domain %d pervcpu_vhpt %d\n",
+               __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
+#endif
        d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
        if (d->shared_info == NULL)
            goto fail_nomem;
index 58c89201fbe6739ca886025239703802bfe42294..612aced10592921ad30d0ee4ff3d05946a369e0f 100644 (file)
@@ -260,7 +260,7 @@ int set_one_rr(unsigned long rr, unsigned long val)
        } else if (rreg == 7) {
                ia64_new_rr7(vmMangleRID(newrrv.rrval),v->domain->shared_info,
                             v->arch.privregs, v->domain->arch.shared_info_va,
-                            __get_cpu_var(vhpt_paddr));
+                            vcpu_vhpt_maddr(v));
        } else {
                set_rr(rr,newrrv.rrval);
        }
index b439ccda42cdc85de4e2ef5aed0aa4bcfe1e638a..a8220da1e148cd2bbbff77e2b50ddfeeb2526db3 100644 (file)
@@ -3,6 +3,10 @@
  *
  * Copyright (C) 2004 Hewlett-Packard Co
  *     Dan Magenheimer <dan.magenheimer@hp.com>
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *                    per vcpu vhpt support
  */
 #include <linux/config.h>
 #include <linux/kernel.h>
@@ -24,18 +28,32 @@ extern long running_on_sim;
 DEFINE_PER_CPU (unsigned long, vhpt_paddr);
 DEFINE_PER_CPU (unsigned long, vhpt_pend);
 
-void vhpt_flush(void)
+static void
+ __vhpt_flush(unsigned long vhpt_maddr)
 {
-       struct vhpt_lf_entry *v = __va(__ia64_per_cpu_var(vhpt_paddr));
+       struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        int i;
 
        for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
                v->ti_tag = INVALID_TI_TAG;
 }
 
-static void vhpt_erase(void)
+void
+local_vhpt_flush(void)
+{
+       __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr));
+}
+
+static void
+vcpu_vhpt_flush(struct vcpu* v)
+{
+       __vhpt_flush(vcpu_vhpt_maddr(v));
+}
+
+static void
+vhpt_erase(unsigned long vhpt_maddr)
 {
-       struct vhpt_lf_entry *v = (struct vhpt_lf_entry *)VHPT_ADDR;
+       struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
        int i;
 
        for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++) {
@@ -47,17 +65,6 @@ static void vhpt_erase(void)
        // initialize cache too???
 }
 
-
-static void vhpt_map(unsigned long pte)
-{
-       unsigned long psr;
-
-       psr = ia64_clear_ic();
-       ia64_itr(0x2, IA64_TR_VHPT, VHPT_ADDR, pte, VHPT_SIZE_LOG2);
-       ia64_set_psr(psr);
-       ia64_srlz_i();
-}
-
 void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long logps)
 {
        struct vhpt_lf_entry *vlfe = (struct vhpt_lf_entry *)ia64_thash(vadr);
@@ -102,7 +109,7 @@ void vhpt_multiple_insert(unsigned long vaddr, unsigned long pte, unsigned long
 
 void vhpt_init(void)
 {
-       unsigned long paddr, pte;
+       unsigned long paddr;
        struct page_info *page;
 #if !VHPT_ENABLED
        return;
@@ -122,14 +129,51 @@ void vhpt_init(void)
        __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
        printf("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
                paddr, __get_cpu_var(vhpt_pend));
-       pte = pte_val(pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL));
-       vhpt_map(pte);
-       ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
-               VHPT_ENABLED);
-       vhpt_erase();
+       vhpt_erase(paddr);
+       // we don't enable VHPT here.
+       // context_switch() or schedule_tail() does it.
 }
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+int
+pervcpu_vhpt_alloc(struct vcpu *v)
+{
+       unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
+
+       v->arch.vhpt_entries =
+               (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
+       v->arch.vhpt_page =
+               alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
+       if (!v->arch.vhpt_page)
+               return -ENOMEM;
+       
+       v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
+       if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
+               panic("pervcpu_vhpt_init: bad VHPT alignment!\n");
+
+       v->arch.pta.val = 0; // to zero reserved bits
+       v->arch.pta.ve = 1; // enable vhpt
+       v->arch.pta.size = VHPT_SIZE_LOG2;
+       v->arch.pta.vf = 1; // long format
+       //v->arch.pta.base = __va(v->arch.vhpt_maddr) >> 15;
+       v->arch.pta.base = VHPT_ADDR >> 15;
+
+       vhpt_erase(v->arch.vhpt_maddr);
+       smp_mb(); // per vcpu vhpt may be used by another physical cpu.
+       return 0;
+}
 
+void
+pervcpu_vhpt_free(struct vcpu *v)
+{
+       free_domheap_pages(v->arch.vhpt_page, VHPT_SIZE_LOG2 - PAGE_SHIFT);
+}
+#endif
+
+// SMP: we can't assume v == current, vcpu might move to another physical cpu.
+// So a memory barrier is necessary.
+// If we can guarantee that the vcpu can run only on this physical cpu
+// (e.g. vcpu == current), smp_mb() is unnecessary.
 void vcpu_flush_vtlb_all(struct vcpu *v)
 {
        if (VMX_DOMAIN(v)) {
@@ -144,9 +188,14 @@ void vcpu_flush_vtlb_all(struct vcpu *v)
                /* First VCPU tlb.  */
                vcpu_purge_tr_entry(&PSCBX(v,dtlb));
                vcpu_purge_tr_entry(&PSCBX(v,itlb));
+               smp_mb();
 
                /* Then VHPT.  */
-               vhpt_flush();
+               if (HAS_PERVCPU_VHPT(v->domain))
+                       vcpu_vhpt_flush(v);
+               else
+                       local_vhpt_flush();
+               smp_mb();
 
                /* Then mTLB.  */
                local_flush_tlb_all();
@@ -176,6 +225,13 @@ void domain_flush_vtlb_all (void)
                if (v->processor == cpu)
                        vcpu_flush_vtlb_all(v);
                else
+                       // SMP: it is racy to reference v->processor.
+                       // The vcpu scheduler may move this vcpu to another
+                       // physical processor, and change the value
+                       // using a plain store.
+                       // We may be seeing the old value of it.
+                       // In such a case, flush_vtlb_for_context_switch()
+                       // takes care of the mTLB flush.
                        smp_call_function_single(v->processor,
                                                 __vcpu_flush_vtlb_all,
                                                 v, 1, 1);
@@ -183,24 +239,42 @@ void domain_flush_vtlb_all (void)
        perfc_incrc(domain_flush_vtlb_all);
 }
 
-static void cpu_flush_vhpt_range (int cpu, u64 vadr, u64 addr_range)
+// Callers may need to call smp_mb() before/after calling this.
+// Be careful.
+static void
+__flush_vhpt_range(unsigned long vhpt_maddr, u64 vadr, u64 addr_range)
 {
-       void *vhpt_base = __va(per_cpu(vhpt_paddr, cpu));
+       void *vhpt_base = __va(vhpt_maddr);
 
        while ((long)addr_range > 0) {
                /* Get the VHPT entry.  */
                unsigned int off = ia64_thash(vadr) - VHPT_ADDR;
-               volatile struct vhpt_lf_entry *v;
-               v = vhpt_base + off;
+               struct vhpt_lf_entry *v = vhpt_base + off;
                v->ti_tag = INVALID_TI_TAG;
                addr_range -= PAGE_SIZE;
                vadr += PAGE_SIZE;
        }
 }
 
+static void
+cpu_flush_vhpt_range(int cpu, u64 vadr, u64 addr_range)
+{
+       __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
+}
+
+static void
+vcpu_flush_vhpt_range(struct vcpu* v, u64 vadr, u64 addr_range)
+{
+       __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
+}
+
 void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
 {
-       cpu_flush_vhpt_range (current->processor, vadr, 1UL << log_range);
+       if (HAS_PERVCPU_VHPT(current->domain))
+               vcpu_flush_vhpt_range(current, vadr, 1UL << log_range);
+       else
+               cpu_flush_vhpt_range(current->processor,
+                                    vadr, 1UL << log_range);
        ia64_ptcl(vadr, log_range << 2);
        ia64_srlz_i();
        perfc_incrc(vcpu_flush_tlb_vhpt_range);
@@ -233,8 +307,18 @@ void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range)
                if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
                        continue;
 
-               /* Invalidate VHPT entries.  */
-               cpu_flush_vhpt_range (v->processor, vadr, addr_range);
+               if (HAS_PERVCPU_VHPT(d)) {
+                       vcpu_flush_vhpt_range(v, vadr, addr_range);
+               } else {
+                       // SMP: it is racy to reference v->processor.
+                       // The vcpu scheduler may move this vcpu to another
+                       // physical processor and change the value using a
+                       // plain store, so we may be seeing its old value.
+                       // In such a case, flush_vtlb_for_context_switch()
+                       // takes care of the VHPT flush.
+                       /* Invalidate VHPT entries.  */
+                       cpu_flush_vhpt_range(v->processor, vadr, addr_range);
+               }
        }
        // ptc.ga has release semantics.
 
@@ -246,7 +330,7 @@ void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range)
 static void flush_tlb_vhpt_all (struct domain *d)
 {
        /* First VHPT.  */
-       vhpt_flush ();
+       local_vhpt_flush ();
 
        /* Then mTLB.  */
        local_flush_tlb_all ();
@@ -255,7 +339,10 @@ static void flush_tlb_vhpt_all (struct domain *d)
 void domain_flush_tlb_vhpt(struct domain *d)
 {
        /* Very heavy...  */
-       on_each_cpu ((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
+       if (HAS_PERVCPU_VHPT(d) /* || VMX_DOMAIN(v) */)
+               on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
+       else
+               on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
        cpus_clear (d->domain_dirty_cpumask);
 }
 
index 99cfa7415e97e53173e294f9f1cdfe7b1c248ac0..34e24e5a85c895029d6a19a0d9d21dd2d341d43b 100644 (file)
@@ -87,6 +87,9 @@ struct arch_domain {
         unsigned long flags;
         struct {
             unsigned int is_vti : 1;
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+            unsigned int has_pervcpu_vhpt : 1;
+#endif
         };
     };
 
@@ -142,6 +145,13 @@ struct arch_domain {
     (sizeof(vcpu_info_t) * (v)->vcpu_id + \
     offsetof(vcpu_info_t, evtchn_upcall_mask))
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+#define HAS_PERVCPU_VHPT(d)     ((d)->arch.has_pervcpu_vhpt)
+#else
+#define HAS_PERVCPU_VHPT(d)     (0)
+#endif
+
+
 struct arch_vcpu {
     /* Save the state of vcpu.
        This is the first entry to speed up accesses.  */
@@ -193,6 +203,13 @@ struct arch_vcpu {
     struct timer hlt_timer;
     struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+    PTA                 pta;
+    unsigned long       vhpt_maddr;
+    struct page_info*   vhpt_page;
+    unsigned long       vhpt_entries;
+#endif
+
 #define INVALID_PROCESSOR       INT_MAX
     int last_processor;
 };
index cb4fc30462f209b8f5d2ab138b76fc4fd37dcdf7..c59d8fd6351c1a469dda7de0b1ebcdf4e030cfef 100644 (file)
@@ -37,11 +37,46 @@ extern void vhpt_multiple_insert(unsigned long vaddr, unsigned long pte,
                                 unsigned long logps);
 extern void vhpt_insert (unsigned long vadr, unsigned long pte,
                         unsigned long logps);
-void vhpt_flush(void);
+void local_vhpt_flush(void);
 
 /* Currently the VHPT is allocated per CPU.  */
 DECLARE_PER_CPU (unsigned long, vhpt_paddr);
 DECLARE_PER_CPU (unsigned long, vhpt_pend);
 
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+#if !VHPT_ENABLED
+#error "VHPT_ENABLED must be set for CONFIG_XEN_IA64_PERVCPU_VHPT"
+#endif
+#endif
+
+#include <xen/sched.h>
+int pervcpu_vhpt_alloc(struct vcpu *v);
+void pervcpu_vhpt_free(struct vcpu *v);
+static inline unsigned long
+vcpu_vhpt_maddr(struct vcpu* v)
+{
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+    if (HAS_PERVCPU_VHPT(v->domain))
+        return v->arch.vhpt_maddr;
+#endif
+
+#if 0
+    // referencing v->processor is racy.
+    return per_cpu(vhpt_paddr, v->processor);
+#endif
+    BUG_ON(v != current);
+    return __get_cpu_var(vhpt_paddr);
+}
+
+static inline unsigned long
+vcpu_pta(struct vcpu* v)
+{
+#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
+    if (HAS_PERVCPU_VHPT(v->domain))
+        return v->arch.pta.val;
+#endif
+    return VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) | VHPT_ENABLED;
+}
+
 #endif /* !__ASSEMBLY */
 #endif
index dcfaf65d6ba337ce3c1ba0c25fefeae6f358cb3d..d2dcd2bc84b1856a1b92fb28f674e0899b638de0 100644 (file)
@@ -7,8 +7,7 @@
 #define IA64_TR_SHARED_INFO    3       /* dtr3: page shared with domain */
 #define        IA64_TR_VHPT            4       /* dtr4: vhpt */
 #define IA64_TR_MAPPED_REGS    5       /* dtr5: vcpu mapped regs */
-#define IA64_TR_PERVP_VHPT     6
-#define IA64_DTR_GUEST_KERNEL   7
+#define IA64_DTR_GUEST_KERNEL   6
 #define IA64_ITR_GUEST_KERNEL   2
 /* Processor status register bits: */
 #define IA64_PSR_VM_BIT                46